Data Presentation

Packages Installation

library(survival)
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(riskRegression)
## riskRegression version 2023.12.21
library(pec)
## Loading required package: prodlim
## 
## Attaching package: 'pec'
## The following objects are masked from 'package:riskRegression':
## 
##     ipcw, selectCox
## The following object is masked from 'package:caret':
## 
##     R2
library(splines)
library(RColorBrewer)
library(survivalROC)
library(randomForestSRC)
## 
##  randomForestSRC 3.3.1 
##  
##  Type rfsrc.news() to see new features, changes, and bug fixes. 
## 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Data Extraction

df <- read.csv("hiv.csv")
head(df)
##   subject  time death       cd4 time_obs treatment  sex prev_infection
## 1       1 16.97     0 10.677078        0       ddC male           AIDS
## 2       1 16.97     0  8.426150        6       ddC male           AIDS
## 3       1 16.97     0  9.433981       12       ddC male           AIDS
## 4       2 19.00     0  6.324555        0       ddI male         noAIDS
## 5       2 19.00     0  8.124038        6       ddI male         noAIDS
## 6       2 19.00     0  4.582576       12       ddI male         noAIDS
##           azt
## 1 intolerance
## 2 intolerance
## 3 intolerance
## 4 intolerance
## 5 intolerance
## 6 intolerance
str(df)
## 'data.frame':    1405 obs. of  9 variables:
##  $ subject       : int  1 1 1 2 2 2 2 3 3 3 ...
##  $ time          : num  17 17 17 19 19 ...
##  $ death         : int  0 0 0 0 0 0 0 1 1 1 ...
##  $ cd4           : num  10.68 8.43 9.43 6.32 8.12 ...
##  $ time_obs      : int  0 6 12 0 6 12 18 0 2 6 ...
##  $ treatment     : chr  "ddC" "ddC" "ddC" "ddI" ...
##  $ sex           : chr  "male" "male" "male" "male" ...
##  $ prev_infection: chr  "AIDS" "AIDS" "AIDS" "noAIDS" ...
##  $ azt           : chr  "intolerance" "intolerance" "intolerance" "intolerance" ...

Exploratory Data Analysis

Data cleaning

df$treatment <- as.factor(df$treatment)
df$sex <- as.factor(df$sex)
df$prev_infection<-as.factor(df$prev_infection)
df$azt <- as.factor(df$azt)
str(df)
## 'data.frame':    1405 obs. of  9 variables:
##  $ subject       : int  1 1 1 2 2 2 2 3 3 3 ...
##  $ time          : num  17 17 17 19 19 ...
##  $ death         : int  0 0 0 0 0 0 0 1 1 1 ...
##  $ cd4           : num  10.68 8.43 9.43 6.32 8.12 ...
##  $ time_obs      : int  0 6 12 0 6 12 18 0 2 6 ...
##  $ treatment     : Factor w/ 2 levels "ddC","ddI": 1 1 1 2 2 2 2 2 2 2 ...
##  $ sex           : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 1 1 1 ...
##  $ prev_infection: Factor w/ 2 levels "AIDS","noAIDS": 1 1 1 2 2 2 2 1 1 1 ...
##  $ azt           : Factor w/ 2 levels "failure","intolerance": 2 2 2 2 2 2 2 2 2 2 ...
cat('Nombre de données manquantes : ', sum(is.na(df)), '\n')
## Nombre de données manquantes :  0
cat('Nombre de doublons : ', sum(duplicated(df)))
## Nombre de doublons :  0

3)

CD4 Summary :

df$cd4_cat <- cut(df$cd4,
                  breaks = c(0, 7, 15, 24.125),
                  labels = c("Low", "Medium", "High"),
                  right = TRUE)
summary(df$cd4)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.162   5.477   7.023  10.440  24.125
histogram_plot = ggplot(df, aes(x = cd4)) +
  geom_histogram(fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogramme de la variable CD4", x = "CD4", y = "Fréquence") +
  theme_minimal()

density_plot = ggplot(df, aes(x = cd4)) +
  geom_density(fill = "darkgreen", alpha = 0.5) +
  labs(title = "Densité de la variable CD4", x = "CD4", y = "Densité") +
  theme_minimal()

histogram_plot
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

density_plot

ggsave("histogramme_cd4.png", plot = histogram_plot, width = 6, height = 4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("densite_cd4.png", plot = density_plot, width = 6, height = 4)

Sex distribution:

histo_sex_plot = ggplot(df, aes(x = sex)) +
  geom_bar(fill = "lightblue", color = "black") +
  labs(title = "Distribution du Sexe", x = "Sexe", y = "Nombre d'individus") +
  theme_minimal()

cat("Résumé numérique de la variable sex : \n")
## Résumé numérique de la variable sex :
print(table(df$sex))
## 
## female   male 
##    117   1288
histo_sex_plot

ggsave("histogram_sex.png", plot = histo_sex_plot, width = 6, height = 4)

We notice that the vast majority of patients are men.

cd4_azt_plot = ggplot(df, aes(x = azt, y = cd4, fill = azt)) +
  geom_boxplot() +
  labs(title = "Boxplot de CD4 par Sexe", x = "AZT", y = "CD4") +
  theme_minimal()

cd4_azt_plot

ggsave("boxplot_azt_cd4.png", plot = cd4_azt_plot, width = 6, height = 4)
ggplot(df, aes(x = azt, y = cd4, fill = azt)) +
  geom_boxplot() +
  labs(title = "Boxplot de CD4 par Traitement", x = "Traitement", y = "CD4") +
  theme_minimal()

ggpairs(df[, sapply(df, is.numeric)], 
        title = "Matrice de Corrélation des Variables Continues")

boxplot_infection = ggplot(df, aes(x = interaction(prev_infection, azt), y = cd4, fill = azt)) +
  geom_boxplot() +
  labs(title = "Boxplot de CD4 par Sexe et Infections", x = "Sexe et Inféctions", y = "CD4") +
  theme_minimal()

boxplot_infection

ggsave("boxplot_infection.png", plot = boxplot_infection, width = 6, height = 4)
surv_plot_treatment <- ggsurvplot (
  survfit(Surv(time, death) ~ treatment, data = df),
  data = df,
  pval = TRUE,
  conf.int = TRUE,
  title = "Courbe de Survie par Traitement",
  xlab = " Temps (mois)",
  ylab = "Probabilité de survie",
  legend.title = "Type de Traitement",
  palette = RColorBrewer::brewer.pal(3, "Set1")
)

surv_plot_treatment

logrank_treatment <- survdiff(Surv(time, death) ~ treatment, data = df)
logrank_treatment
## Call:
## survdiff(formula = Surv(time, death) ~ treatment, data = df)
## 
##                 N Observed Expected (O-E)^2/E (O-E)^2/V
## treatment=ddC 717      200      213     0.843      1.75
## treatment=ddI 688      212      199     0.906      1.75
## 
##  Chisq= 1.8  on 1 degrees of freedom, p= 0.2
ggsave("surv_plot_treatment.png", plot = surv_plot_treatment$plot, width = 8, height = 6, dpi = 300)
fit_sex <- survfit(Surv(time, death) ~ sex, data = df)

survplot_sex = ggsurvplot (
  fit_sex,
  data = df,
  pval = TRUE,
  conf.int = TRUE,
  title = "Courbe de Survie par Genre",
  xlab = " Temps (mois)",
  ylab = "Probabilité de survie",
  legend.title = "Genre",
  palette = "viridis"
)
survplot_sex

logrank_sex <- survdiff(Surv(time, death) ~ sex, data = df)
logrank_sex
## Call:
## survdiff(formula = Surv(time, death) ~ sex, data = df)
## 
##               N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=female  117       33     32.5  0.007689   0.00838
## sex=male   1288      379    379.5  0.000658   0.00838
## 
##  Chisq= 0  on 1 degrees of freedom, p= 0.9
ggsave("survplot_sex.png", plot = survplot_sex$plot, width = 8, height = 6, dpi = 300)
fit_cd4 <- survfit(Surv(time, death) ~ cd4_cat, data = df)

surv_plot_cd4 <- ggsurvplot (
  fit_cd4,
  data = df,
  pval = TRUE,
  conf.int = TRUE,
  title = "Survival Curve based on CD4 Cell Count",
  xlab = " Times (months)",
  ylab = "Survival probability",
  legend.title = "CD4 Cell count. \n", 
  palette = c("#E7B800", "#2E9FDF", "darkgreen")  
)
surv_plot_cd4

ggsave("surv_plot_cd4.png", plot = surv_plot_cd4$plot, width = 8, height = 6, dpi = 300)
fit_infection <- survfit(Surv(time, death) ~ prev_infection, data = df)

surv_plot_infection <- ggsurvplot (
  fit_infection,
  data = df,
  pval = TRUE,
  conf.int = TRUE,
  title = "Courbe de Survie selon les antécédents d'infection (SIDA)",
  xlab = " Temps (mois)",
  ylab = "Probabilité de survie",
  legend.title = "Antécédents d'Infection \n", 
  palette = c("#E7B800", "#2E9FDF")  
)

logrank_infection <- survdiff(Surv(time, death) ~ prev_infection, data = df)
logrank_infection
## Call:
## survdiff(formula = Surv(time, death) ~ prev_infection, data = df)
## 
##                         N Observed Expected (O-E)^2/E (O-E)^2/V
## prev_infection=AIDS   863      344      232      53.9       125
## prev_infection=noAIDS 542       68      180      69.6       125
## 
##  Chisq= 125  on 1 degrees of freedom, p= <2e-16
ggsave("surv_plot_infection.png", plot = surv_plot_infection$plot, width = 8, height = 6, dpi = 300)
surv_plot_azt <- ggsurvplot (
  survfit(Surv(time, death) ~ azt, data = df),
  data = df,
  pval = TRUE,
  conf.int = TRUE,
  title = "Courbe de Survie par Genre",
  xlab = " Temps (mois)",
  ylab = "Probabilité de survie",
  legend.title = "Genre",
  palette = RColorBrewer::brewer.pal(3, "Dark2")
)
surv_plot_azt

logrank_azt <- survdiff(Surv(time, death) ~ treatment, data = df)
logrank_azt
## Call:
## survdiff(formula = Surv(time, death) ~ treatment, data = df)
## 
##                 N Observed Expected (O-E)^2/E (O-E)^2/V
## treatment=ddC 717      200      213     0.843      1.75
## treatment=ddI 688      212      199     0.906      1.75
## 
##  Chisq= 1.8  on 1 degrees of freedom, p= 0.2
ggsave("surv_plot_azt.png", plot=surv_plot_azt$plot, width=8, height=6, dpi=300)

B. Statistical modelling and analysis

7)

df <- df %>% select(-cd4_cat)
train <- createDataPartition(df$death, p = 0.7, list = FALSE)

train_data <- df[train,]
test_data <- df[-train,]

cat("Distribution de la variable 'death' dans l'ensemble d'entraînement :\n")
## Distribution de la variable 'death' dans l'ensemble d'entraînement :
table(train_data$death)
## 
##   0   1 
## 698 286
cat("\nDistribution de la variable 'death' dans l'ensemble de test :\n")
## 
## Distribution de la variable 'death' dans l'ensemble de test :
table(test_data$death)
## 
##   0   1 
## 295 126

8)

train_data$prev_infection <- relevel(train_data$prev_infection, ref = "AIDS")
train_data$azt <- relevel(train_data$azt, ref = "failure")

cox <- coxph(Surv(time, death) ~prev_infection + azt, data = train_data, x = TRUE)

cox
## Call:
## coxph(formula = Surv(time, death) ~ prev_infection + azt, data = train_data, 
##     x = TRUE)
## 
##                          coef exp(coef) se(coef)      z      p
## prev_infectionnoAIDS -1.53564   0.21532  0.18100 -8.484 <2e-16
## aztintolerance        0.03934   1.04012  0.12945  0.304  0.761
## 
## Likelihood ratio test=113.5  on 2 df, p=< 2.2e-16
## n= 984, number of events= 286
cox_zph <- cox.zph(cox)

print(cox_zph)
##                chisq df     p
## prev_infection  3.23  1 0.072
## azt             2.67  1 0.102
## GLOBAL          4.12  2 0.128
ggcoxzph(cox_zph)

Cela signifie que les effets des covariables sur le risque de survie sont constants dans le temps. Cela valide l’utilisation d’un modèle de Cox avec ces covariables.

Modèle Généralisé

train_data$azt <- factor(train_data$azt)
train_data$prev_infection <- factor(train_data$prev_infection)

cox_nl <- coxph(Surv(time, death) ~ prev_infection + azt + ns(cd4, df = 2), data = train_data)

summary(cox_nl)
## Call:
## coxph(formula = Surv(time, death) ~ prev_infection + azt + ns(cd4, 
##     df = 2), data = train_data)
## 
##   n= 984, number of events= 286 
## 
##                          coef exp(coef) se(coef)      z Pr(>|z|)    
## prev_infectionnoAIDS -1.10477   0.33129  0.18551 -5.955 2.59e-09 ***
## aztintolerance        0.09237   1.09677  0.12901  0.716 0.473990    
## ns(cd4, df = 2)1     -3.24233   0.03907  0.47228 -6.865 6.64e-12 ***
## ns(cd4, df = 2)2     -3.30297   0.03677  0.95808 -3.447 0.000566 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                      exp(coef) exp(-coef) lower .95 upper .95
## prev_infectionnoAIDS   0.33129     3.0185  0.230303    0.4765
## aztintolerance         1.09677     0.9118  0.851728    1.4123
## ns(cd4, df = 2)1       0.03907    25.5934  0.015483    0.0986
## ns(cd4, df = 2)2       0.03677    27.1931  0.005624    0.2405
## 
## Concordance= 0.71  (se = 0.015 )
## Likelihood ratio test= 174.1  on 4 df,   p=<2e-16
## Wald test            = 114.3  on 4 df,   p=<2e-16
## Score (logrank) test = 150.6  on 4 df,   p=<2e-16
cox_zph_nl <- cox.zph(cox_nl)

print(cox_zph_nl)
##                 chisq df     p
## prev_infection   4.01  1 0.045
## azt              2.33  1 0.127
## ns(cd4, df = 2)  3.16  2 0.206
## GLOBAL           7.65  4 0.105
ggcoxzph(cox_zph_nl)

anova(cox, cox_nl, test = "LRT")
## Analysis of Deviance Table
##  Cox model: response is  Surv(time, death)
##  Model 1: ~ prev_infection + azt
##  Model 2: ~ prev_infection + azt + ns(cd4, df = 2)
##    loglik  Chisq Df Pr(>|Chi|)    
## 1 -1811.3                         
## 2 -1781.0 60.644  2  6.781e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

10)

patient_1 <- data.frame(prev_infection = "noAIDS", azt = "failure") 
patient_2 <- data.frame(prev_infection = "AIDS", azt = "intolerance")

patient_1_nl <- data.frame(prev_infection = "noAIDS", azt = "failure", cd4 = 20) 
patient_2_nl <- data.frame(prev_infection = "AIDS", azt = "intolerance", cd4 = 2)

surv_patient_1_cox <- survfit(cox, newdata = patient_1)
surv_patient_2_cox <- survfit(cox, newdata = patient_2)

surv_patient_1_cox_nl <- survfit(cox_nl, newdata = patient_1_nl)
surv_patient_2_cox_nl <- survfit(cox_nl, newdata = patient_2_nl)


prob_18_months_patient_1 <- summary(surv_patient_1_cox, times = 18)$surv
prob_18_months_patient_2 <- summary(surv_patient_2_cox, times = 18)$surv

prob_18_months_patient_1_nl <- summary(surv_patient_1_cox_nl, times = 18)$surv
prob_18_months_patient_2_nl <- summary(surv_patient_2_cox_nl, times = 18)$surv

cat("The probability that these two patients will survive beyond 18 months with the Linear Model : \n")
## The probability that these two patients will survive beyond 18 months with the Linear Model :
cat("First Patient : - ", prob_18_months_patient_1, "\n")
## First Patient : -  0.8760176
cat("Second Patient : -", prob_18_months_patient_2, "\n")
## Second Patient : - 0.5275944
cat("\nThe probability that these two patients will survive beyond 18 months with the Non linear Model : \n")
## 
## The probability that these two patients will survive beyond 18 months with the Non linear Model :
cat("First Patient : - ", prob_18_months_patient_1_nl, "\n")
## First Patient : -  0.9812692
cat("Second Patient : - ", prob_18_months_patient_2_nl, "\n")
## Second Patient : -  0.3875033

11)

patient_1_12_months <- data.frame(prev_infection = "noAIDS", azt = "failure", cd4 = 20)
patient_2_12_months <- data.frame(prev_infection = "AIDS", azt = "intolerance", cd4 = 2)

surv_patient_1_12_months <- survfit(cox_nl, newdata = patient_1_12_months)
surv_patient_2_12_months <- survfit(cox_nl, newdata = patient_2_12_months)

prob_patient_1_12_months <- summary(surv_patient_1_12_months, times = 12)$surv
prob_patient_2_12_months <- summary(surv_patient_2_12_months, times = 12)$surv

prob_patient_1_18_months <- summary(surv_patient_1_12_months, times = 18)$surv
prob_patient_2_18_months <- summary(surv_patient_2_12_months, times = 18)$surv

prob_cond_patient_1 <- prob_patient_1_18_months / prob_patient_1_12_months
prob_cond_patient_2 <- prob_patient_2_18_months / prob_patient_2_12_months

cat("First Patient - Conditional probability of survival between 12 and 18 months : ", prob_cond_patient_1, "\n")
## First Patient - Conditional probability of survival between 12 and 18 months :  0.9911895
cat("Individu âgé - Probabilité conditionnelle de survie entre 12 et 18 mois : ", prob_cond_patient_2, "\n")
## Individu âgé - Probabilité conditionnelle de survie entre 12 et 18 mois :  0.6416609

12)

rf <- rfsrc(Surv(time, death) ~ prev_infection + azt + cd4, 
            data = train_data, 
            ntree = 200)

rf_opti = rfsrc(
  Surv(time, death) ~ prev_infection + azt + cd4,
  data = train_data,
  ntree = 200,
  mtry = 1,
  nodesize = 5
)
cox_pred <- predict(cox_nl, newdata = test_data, type = "risk")
rf_pred <- predict(rf, newdata = test_data, type = "response")
rf_opti_pred <- predict(rf_opti, newdata = test_data, type = "response")

rf_pred_probs <- rf_pred$predicted

rf_pred_opti_probs = rf_opti_pred$predicted

13)

cox_roc <- survivalROC(
  Stime = test_data$time, 
  status = test_data$death, 
  marker = cox_pred,
  predict.time = 18,
  method = "KM"
)

cox_auc <- cox_roc$AUC

rf_roc <- survivalROC(
  Stime = test_data$time,
  status = test_data$death,
  marker = rf_pred_probs,
  predict.time = 18,
  method = "KM"
)

rf_auc <- rf_roc$AUC

rf_opti_roc <- survivalROC(
  Stime = test_data$time,
  status = test_data$death,
  marker = rf_pred_opti_probs,
  predict.time = 18,
  method = "KM"
)

rf_opti_auc <- rf_opti_roc$AUC

roc_data_cox <- data.frame(
  FPR = cox_roc$FP,
  TPR = cox_roc$TP,
  Model = "Cox",
  AUC = round(cox_roc$AUC, 3)
)

roc_data_rf <- data.frame(
  FPR = rf_roc$FP,
  TPR = rf_roc$TP,
  Model = "RF",
  AUC = round(rf_roc$AUC, 3)
)

roc_data_rf_opti <- data.frame(
  FPR = rf_opti_roc$FP,
  TPR = rf_opti_roc$TP,
  Model = "RF Optimisé",
  AUC = round(rf_opti_roc$AUC, 3)
)

roc_data <- rbind(roc_data_cox, roc_data_rf, roc_data_rf_opti)

roc_curve = ggplot(roc_data, aes(x = FPR, y = TPR, color = Model)) +
  geom_line(size = 1) + 
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  labs(
    title = paste("ROC Curves (AUC)"),
    x = "False Positive Rate (FPR)",
    y = "True Positive Rate (TPR)",
    subtitle = paste("Cox AUC =", cox_auc, "\nRF AUC =", rf_auc, "\nRF Optimized, AUC =", rf_opti_auc)
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggsave("roc_curve.png", plot=roc_curve, width=8, height=6, dpi=300)
times <- seq(0, max(test_data$time), by = 5)

brier_score <- Score(
  object = list("Cox" = cox),
  formula = Surv(time, death) ~ 1,
  data = test_data,
  times = times,
  metrics = "Brier"
)
brier_times <- brier_score$times
brier_scores <- brier_score$Brier$score

str(brier_score$times)
##  num [1:5] 0 5 10 15 20
str(brier_score$Brier$score)
## Classes 'data.table' and 'data.frame':   10 obs. of  6 variables:
##  $ model: Factor w/ 2 levels "Null model","Cox": 1 1 1 1 1 2 2 2 2 2
##  $ times: num  0 5 10 15 20 0 5 10 15 20
##  $ Brier: num  0 0.0474 0.1289 0.2024 0.2357 ...
##  $ se   : num  0 0.00956 0.01219 0.01001 0.00875 ...
##  $ lower: num  0 0.0287 0.105 0.1828 0.2186 ...
##  $ upper: num  0 0.0661 0.1528 0.222 0.2529 ...
##  - attr(*, ".internal.selfref")=<externalptr>
embedded <- function(brier_results) {
  times <- brier_results$times
  brier_scores <- brier_results$Brier$score
  
  intervals <- diff(times)
  weights <- c(intervals, 0) / sum(intervals)
  
  # Calculer le score intégré
  embedded_score <- sum(brier_scores * weights, na.rm = TRUE)
  return(embedded_score)
}

embedded_score <- embedded(brier_score)
## Warning in Ops.factor(left, right): '*' not meaningful for factors
cat("Embedded Score: ", embedded_score, "\n")
## Embedded Score:  15.57423